In [1]:
// Almond kernel switch; presumably suppresses the automatic echo of cell results — TODO confirm.
kernel.silent(true)

Dependencies

In [2]:
import coursierapi.MavenRepository
// Add JitPack to the resolver list so that `import $ivy` lines can resolve artifacts hosted there.
interp.repositories() ++= Seq(MavenRepository.of("https://jitpack.io"))
In [3]:
import $ivy.`com.github.propi:rdfrules:1.5.0`
//import $ivy.`com.github.propi.rdfrules::core:1.0.0`
import collection._
import org.apache.jena.riot.Lang
import scala.util.control.Breaks._
import scala.collection.immutable.ListMap
In [4]:
import $ivy.`org.plotly-scala::plotly-almond:0.8.2`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
// Initialise plotly for the Almond front end with `offline = true`.
init(offline=true)
// Shrink the REPL pretty-printer's default height to 3 so echoed values stay short.
repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)
In [5]:
import com.github.propi.rdfrules.data._
import com.github.propi.rdfrules.algorithm.amie._
import com.github.propi.rdfrules.algorithm.dbscan._
import com.github.propi.rdfrules.utils._
import com.github.propi.rdfrules.index._
import com.github.propi.rdfrules.rule._
import com.github.propi.rdfrules.ruleset._

Data Sets

In [6]:
// Predicate URIs (labels, comments, alternate names, images, rdf:type) that are
// filtered out of the YAGO dataset elsewhere in this notebook.
val rdfsLabel = "http://www.w3.org/2000/01/rdf-schema#label"
val rdfsComment = "http://www.w3.org/2000/01/rdf-schema#comment"
val alternateName = "http://schema.org/alternateName"
val image = "http://schema.org/image"
val rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// Rounds `value` to `scale` decimal places using HALF_UP rounding and returns it as a Double.
// `scale` is a primitive Int (not a boxed java.lang.Integer), so it can never be null and
// needs no implicit boxing at the call site.
val rounded = (value: Double, scale: Int) =>
    BigDecimal(value).setScale(scale, BigDecimal.RoundingMode.HALF_UP).toDouble
In [7]:
// (translated from Czech, roughly) "do it the crude way: one cache per dataset, one per index ..."
// Load the four pre-built CZSO slices (regions/districts x total/by-sex) from cache files.
val regionTotalSlice = Dataset.fromCache("../cache/jaur/jaur-regions-total.cache")
val regionBySexSlice = Dataset.fromCache("../cache/jaur/jaur-regions-bysex.cache")
val districtTotalSlice = Dataset.fromCache("../cache/jaur/jaur-districts-total.cache")
val districtBySexSlice = Dataset.fromCache("../cache/jaur/jaur-districts-bysex.cache")
In [ ]:
// Load the three cached YAGO extracts (hop 0, 1 and 2) and ask each for its size.
// NOTE(review): the `.size` calls presumably also force the cache files to be read — confirm.
val yagoHop0 = Dataset.fromCache("../cache/yago-hop0.cache")
yagoHop0.size
val yagoHop1 = Dataset.fromCache("../cache/yago-hop1.cache")
yagoHop1.size
val yagoHop2 = Dataset.fromCache("../cache/yago-hop2.cache")
yagoHop2.size
In [ ]:
// Combine the three YAGO hop datasets into one dataset and report its size.
val yagoDataset = yagoHop0 + yagoHop1 + yagoHop2
yagoDataset.size
In [ ]:
// Drop every quad whose predicate is a label, comment, alternate name, rdf:type or image.
val yagoDatasetFiltered = yagoDataset.filter { q =>
    val predicate = q.triple.predicate
    !predicate.hasSameUriAs(rdfsLabel) &&
        !predicate.hasSameUriAs(rdfsComment) &&
        !predicate.hasSameUriAs(alternateName) &&
        !predicate.hasSameUriAs(rdfType) &&
        !predicate.hasSameUriAs(image)
}

// Sizes are read once each and reused for both the ratio and the report line.
val yagoFilteredSize = yagoDatasetFiltered.size
val yagoTotalSize = yagoDataset.size
val ratio: Double = yagoFilteredSize.toDouble / yagoTotalSize.toDouble
// Round AFTER scaling to percent: multiplying an already rounded ratio by 100 can
// reintroduce floating-point noise (e.g. 0.57 * 100 = 56.99999999999999).
s"$yagoFilteredSize / $yagoTotalSize = ${rounded(ratio * 100, 0)}%"
In [ ]:
// Load the Turtle file that, judging by its name, links CZSO reference areas to YAGO entities.
val refAreaLinking = Dataset("../data/linking/yagoCZSOLinking.ttl")
In [ ]:
// One mining dataset per CZSO slice: the slice itself + the filtered YAGO data + the linking triples.
val regionTotalDataset = regionTotalSlice + yagoDatasetFiltered + refAreaLinking
val regionBySexDataset = regionBySexSlice + yagoDatasetFiltered + refAreaLinking
val districtTotalDataset = districtTotalSlice + yagoDatasetFiltered + refAreaLinking
val districtBySexDataset = districtBySexSlice + yagoDatasetFiltered + refAreaLinking
In [ ]:
// Build an index for each combined dataset and persist it to a cache file.
// NOTE: the same four names are also defined by the cell that loads the indexes from cache;
// run one cell or the other.
val regionTotalIndex = regionTotalDataset.index().cache("../cache/jaur-yago/regionTotalIndex.cache")
val regionBySexIndex = regionBySexDataset.index().cache("../cache/jaur-yago/regionBySexIndex.cache")
val districtTotalIndex = districtTotalDataset.index().cache("../cache/jaur-yago/districtTotalIndex.cache")
val districtBySexIndex = districtBySexDataset.index().cache("../cache/jaur-yago/districtBySexIndex.cache")
In [8]:
// Load the four previously cached indexes.
// NOTE(review): the meaning of the second argument (`false`) is not visible here — confirm in the Index API.
val regionTotalIndex = Index.fromCache("../cache/jaur-yago/regionTotalIndex.cache",false)
val regionBySexIndex = Index.fromCache("../cache/jaur-yago/regionBySexIndex.cache",false)
val districtTotalIndex = Index.fromCache("../cache/jaur-yago/districtTotalIndex.cache",false)
val districtBySexIndex = Index.fromCache("../cache/jaur-yago/districtBySexIndex.cache",false)

Mining

In [9]:
// Shorthand for wrapping a string as a TripleItem.Uri.
val uri = (value: String) => TripleItem.Uri(value)
// Data Cube predicate attaching an observation to its dataset (slice).
val qbDataSet = uri("http://purl.org/linked-data/cube#dataSet")
// Namespace prefix of the CZSO ontology; prepended to the measure names below.
val czsoUri = "http://data.czso.cz/ontology/"
// Predicate pointing from an observation to its reference area.
val refArea = uri("http://data.czso.cz/ontology/refArea")
// Rule constraint built from the `Object` constants position.
val constantsAtObject = RuleConstraint.ConstantsAtPosition.ConstantsPosition.Object
val constantsOnlyAtObject = RuleConstraint.ConstantsAtPosition(constantsAtObject)
In [10]:
// in all cubes
val unemploymentRate = uri(czsoUri+"podilNezamestnanych")
val reachableApplicants = uri(czsoUri+"dosazitelniNeumisteniUchazeciOZamestnani")
// only in total cubes
val unplacedApplicants = uri(czsoUri+"neumisteniUchazeciOZamestnani")
val vacaniesCount = uri(czsoUri+"pocetVolnychMist")
val measures = Array(unemploymentRate,reachableApplicants,unplacedApplicants,vacaniesCount)
val oneOfAllMeasures = OneOf(unemploymentRate,reachableApplicants,unplacedApplicants,vacaniesCount)
val oneOfBySexMeasures = OneOf(unemploymentRate,reachableApplicants)
In [11]:
// URIs identifying the four slices, used as the object of qb:dataSet in the rule patterns,
// plus OneOf alternatives grouping the region slices and the district slices.
val regionTotalSliceUri = uri("jaur-regions-total")
val regionBySexSliceUri = uri("jaur-regions-bysex")
val oneOfRegionCubes = OneOf(regionTotalSliceUri,regionBySexSliceUri)
val districtTotalSliceUri = uri("jaur-districts-total")
val districtBySexSliceUri = uri("jaur-districts-bysex")
val oneOfDistrictCubes = OneOf(districtTotalSliceUri,districtBySexSliceUri)

Regions Total

In [12]:
// Rule pattern for the regions-total slice. Atoms before `=>:` form the body, the atom after it the head:
//   body: some atom about ?b in graph "yago",
//         ?a refArea ?b in graph "czso",
//         ?a qb:dataSet <jaur-regions-total> in graph "czso"
//   head: ?a <one of the four measures> (any object) in graph "czso"
val regionTotalPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionTotalSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)
In [13]:
// Derives a minimum-support threshold from a slice: the number of quads with the qb:dataSet
// predicate divided by the number of distinct objects of refArea quads.
// The division fails if the dataset has no refArea quad (divisor 0).
val minSupport = (d: Dataset) => {
    val observationCount = d.filter(_.triple.predicate == qbDataSet).size
    val distinctAreas = d.filter(_.triple.predicate == refArea).triples.map(_.`object`).toSet
    observationCount / distinctAreas.size
}
In [14]:
// AMIE mining task for the regions-total slice:
//  - minimum support derived from the slice via minSupport,
//  - rules of at most 6 atoms,
//  - minimum head size 1. The previous literal 0 was not the effective value: the logged task
//    settings report MinHeadSize=1, which is also what the regions-by-sex task uses,
//  - constants allowed only at the object position,
//  - rules restricted to regionTotalPattern.
val regionTotalTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(regionTotalSlice)))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addThreshold(Threshold.MinHeadSize(1))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(regionTotalPattern)
In [15]:
// Mine the regions-total rules and print the rule count and the elapsed wall-clock time.
val startTimeMillis = System.currentTimeMillis()
val regionTotalTaskRuleset = regionTotalIndex.mine(regionTotalTask)
println("rules: "+regionTotalTaskRuleset.size)
// Elapsed time is taken after the size has been printed; integer division truncates to whole seconds.
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
2021-05-20 16:57:01:743 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Predicates trimming.
2021-05-20 16:57:02:440 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects indexing.
2021-05-20 16:57:03:403 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects trimming.
2021-05-20 16:57:03:658 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects indexing.
2021-05-20 16:57:04:389 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects trimming.
2021-05-20 16:57:04:542 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Amie task settings:
MinHeadSize=1,
MinHeadCoverage=0.0,
MinSupport=9,
MaxThreads=4,
MinAtomSize=0,
MaxRuleLength=6,
WithConstants=true,
ConstantsPosition=Object,
Timeout=-1,
WithDuplicitPredicates=true,
Patterns=List(Mapped(Vector(Mapped(Variable(?b),Any,Any,Constant(Constant(-1596161386))), Mapped(Variable(?a),Constant(Constant(1481837794)),Variable(?b),Constant(Constant(1015601977))), Mapped(Variable(?a),Constant(Constant(624690160)),Constant(Constant(1687262354)),Constant(Constant(1015601977)))),Some(Mapped(Variable(?a),OneOf(ArrayBuffer(Constant(Constant(-2070273298)), Constant(Constant(1659106226)), Constant(Constant(1797717682)), Constant(Constant(1142069620)))),Any,Constant(Constant(1015601977)))),false,false)),
OnlyPredicates=None,
WithoutPredicates=None
rules: 19890
duration: 21s
In [16]:
// Write the raw mined ruleset to a text file.
regionTotalTaskRuleset.export("../rulesets/jaur-yago/regionTotal.txt")
In [18]:
// Keeps only the rules whose body contains exactly one atom with the refArea predicate.
val filterRuleset = (r: Ruleset) =>
    r.filterResolved(rule => rule.body.count(_.predicate == refArea) == 1)

// Apply the refArea filter, cache the result, export it and print how many rules remain.
val regionTotalTaskRulesetFiltered = filterRuleset(regionTotalTaskRuleset).cache
regionTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/regionTotalFiltered.txt")
println("rules: " + regionTotalTaskRulesetFiltered.size)
rules: 19581
In [19]:
// Plots a histogram of `seq` with bars in the given colour at 60 % opacity.
val plotHistogram = (seq: Seq[Double], color: String) => {
    val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
    val traces = Seq(plotly.Histogram(seq, marker = barStyle))
    plot(traces)
}

// Collects measure `m` from every resolved rule of `r` into a sequence.
// Uses Option.get, so a rule lacking the measure makes this throw.
val measureSequence = (r: Ruleset, m: TypedKeyMap.Key[Measure]) => {
    val perRule = r.resolvedRules.map(rule => rule.measures.get(m).get)
    perRule.toSeq
}

// Support of each filtered regions-total rule, converted to Double for the histogram helper.
val supportSeq = measureSequence(regionTotalTaskRulesetFiltered, Measure.Support).map { measure =>
    measure.asInstanceOf[Measure.Support].value.toDouble
}
plotHistogram(supportSeq, "green")
In [20]:
// Computes PCA confidence with the threshold `minConf`, then sorts the ruleset by
// PCA confidence and then by support.
val computeConfidence = (r: Ruleset, minConf: Double) => {
    val withConfidence = r.computePcaConfidence(minConf)
    withConfidence.sortBy(Measure.PcaConfidence, Measure.Support)
}

// PCA confidence for the regions-total rules (threshold 0.0), cached and exported.
val regionTotalTaskRulesetConfComputed = computeConfidence(regionTotalTaskRulesetFiltered, 0.0).cache
regionTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/regionTotalConfComputed.txt")

// Histogram of the PCA confidence values.
val confSeq = measureSequence(regionTotalTaskRulesetConfComputed, Measure.PcaConfidence).map { measure =>
    measure.asInstanceOf[Measure.PcaConfidence].value
}
plotHistogram(confSeq, "green")
In [21]:
// Computes lift with the threshold `minLift`, then sorts the ruleset by lift and then by support.
val computeLift = (r: Ruleset, minLift: Double) => {
    val withLift = r.computeLift(minLift)
    withLift.sortBy(Measure.Lift, Measure.Support)
}

// Lift for the regions-total rules (threshold 0.0), cached and exported.
val regionTotalTaskRulesetLiftComputed = computeLift(regionTotalTaskRulesetFiltered, 0.0).cache
regionTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/regionTotalLiftComputed.txt")

// Histogram of the lift values.
val liftSeq = measureSequence(regionTotalTaskRulesetLiftComputed, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "green")
In [22]:
// Keep rules with lift of at least 1.0, prune the result and cache it.
val regionTotalTaskRulesetMinLift = regionTotalTaskRulesetLiftComputed
    .filter(_.measures.get(Measure.Lift).get.value >= 1.0)
    .pruned(false, false)
    .cache

println("rules: " + regionTotalTaskRulesetMinLift.size)
regionTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/regionTotalMinLift.txt")

// Histogram of the lift values that survived the filter.
val liftSeq = measureSequence(regionTotalTaskRulesetMinLift, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "green")
rules: 336
In [23]:
// True when the rule has `length` atoms in total, i.e. `length - 1` body atoms plus the head.
val filterByLength = (r: ResolvedRule, length: Int) => {
    val expectedBodySize = length - 1
    r.body.size == expectedBodySize
}

// Counts the rules of `r` for every rule length in [minLength, maxLength] and returns the
// (length -> count) pairs ordered by count, largest first. Ties keep ascending length order.
// An empty range (minLength > maxLength) yields an empty map.
val groupRulesByLength = (r: Ruleset, minLength: Int, maxLength: Int) => {
    val countsByLength = (minLength to maxLength).map { length =>
        length -> r.filterResolved(rule => filterByLength(rule, length)).size
    }
    ListMap(countsByLength.sortWith(_._2 > _._2): _*)
}

// Draws a horizontal bar chart from `map`: the map's values give the bar lengths (x axis), its keys
// the bar positions (y axis). Every bar is annotated with its value.
val plotHorizontalBar = (map: ListMap[Int, Int], color: String) => {
    val counts = map.values.toSeq
    val categories = map.keys.toSeq
    val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
    val traces = Seq(Bar(counts, categories, orientation = Orientation.Horizontal, marker = barStyle))
    // One text label per bar, placed at the bar's end.
    val labels = counts.zip(categories).map { case (count, category) =>
        Annotation(
          x = count, y = category, text = count.toString,
          xanchor = Anchor.Center, yanchor = Anchor.Bottom, showarrow = false
        )
    }
    plot(traces, Layout(annotations = labels))
}

// Bar chart of the regions-total rule counts for rule lengths 4 to 6.
plotHorizontalBar(groupRulesByLength(regionTotalTaskRulesetMinLift, 4, 6), "green")
In [24]:
// Clusters the rules of `r` with DBSCAN using the given neighbour and similarity thresholds,
// then caches the resulting ruleset.
val makeClusters = (r: Ruleset, minNeighbours: Int, minSimilarity: Double) => r.makeClusters {
    // Rule similarity used by the clustering; scoped to this block so only this call picks it up.
    implicit val ruleSimilarityCounting: SimilarityCounting[Rule.Simple] = SimilarityCounting.AtomsSimilarityCounting
    DbScan(minNeighbours = minNeighbours, minSimilarity = minSimilarity)
}.cache

// Cluster the regions-total rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val regionTotalTaskRulesetClustered = makeClusters(regionTotalTaskRulesetMinLift, 3, 0.85)
regionTotalTaskRulesetClustered.export("../rulesets/jaur-yago/regionTotalClustered.txt")
In [25]:
// True when the rule's Cluster measure equals Measure.Cluster(cluster).
// Uses Option.get, so a rule without a Cluster measure makes this throw.
val filterByCluster = (r: ResolvedRule, cluster: Int) => {
    val assigned = r.measures.get(Measure.Cluster).get
    assigned == Measure.Cluster(cluster)
}

// Counts the rules in each cluster of `r`. Cluster numbers are probed as 0, 1, 2, ... and probing
// stops at the first number that matches no rule. The (cluster -> count) pairs are returned
// ordered by count, largest first; ties keep ascending cluster order.
// NOTE(review): clusters numbered after a gap are never counted — this assumes cluster
// numbers are contiguous starting at 0; confirm against the clustering implementation.
val groupRulesByCluster = (r: Ruleset) => {
    val clusterSizes = Iterator.from(0)
        .map(cluster => cluster -> r.filterResolved(rule => filterByCluster(rule, cluster)).size)
        .takeWhile { case (_, size) => size > 0 }
        .toVector
    ListMap(clusterSizes.sortWith(_._2 > _._2): _*)
}

// Bar chart of cluster sizes for the regions-total rules.
plotHorizontalBar(groupRulesByCluster(regionTotalTaskRulesetClustered), "green")

Regions By Sex

In [26]:
// Rule pattern for the regions-by-sex slice. Atoms before `=>:` form the body, the atom after it the head:
//   body: some atom about ?b in graph "yago",
//         ?a refArea ?b in graph "czso",
//         ?a qb:dataSet <jaur-regions-bysex> in graph "czso"
//   head: ?a <one of the two by-sex measures> (any object) in graph "czso"
val regionBySexPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionBySexSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a',predicate = oneOfBySexMeasures, graph = uri("czso"))
)
In [27]:
// AMIE mining task for the regions-by-sex slice: support threshold derived from the slice,
// rules of at most 6 atoms, minimum head size 1, constants only at the object position,
// restricted to regionBySexPattern.
val regionBySexTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(regionBySexSlice)))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addThreshold(Threshold.MinHeadSize(1))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(regionBySexPattern)
In [28]:
// Mine the regions-by-sex rules and print the rule count and the elapsed wall-clock time.
val startTimeMillis = System.currentTimeMillis()
val regionBySexTaskRuleset = regionBySexIndex.mine(regionBySexTask)
println("rules: "+regionBySexTaskRuleset.size)
// Elapsed time is taken after the size has been printed; integer division truncates to whole seconds.
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
2021-05-20 16:58:47:132 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Predicates trimming.
2021-05-20 16:58:47:811 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects indexing.
2021-05-20 16:58:48:577 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects trimming.
2021-05-20 16:58:48:859 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects indexing.
2021-05-20 16:58:49:569 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects trimming.
2021-05-20 16:58:49:661 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Amie task settings:
MinHeadSize=1,
MinHeadCoverage=0.0,
MinSupport=18,
MaxThreads=4,
MinAtomSize=0,
MaxRuleLength=6,
WithConstants=true,
ConstantsPosition=Object,
Timeout=-1,
WithDuplicitPredicates=true,
Patterns=List(Mapped(Vector(Mapped(Variable(?b),Any,Any,Constant(Constant(-1596161386))), Mapped(Variable(?a),Constant(Constant(1481837794)),Variable(?b),Constant(Constant(1015601977))), Mapped(Variable(?a),Constant(Constant(624690160)),Constant(Constant(-1890460990)),Constant(Constant(1015601977)))),Some(Mapped(Variable(?a),OneOf(ArrayBuffer(Constant(Constant(-2070273298)), Constant(Constant(1659106226)))),Any,Constant(Constant(1015601977)))),false,false)),
OnlyPredicates=None,
WithoutPredicates=None
rules: 7193
duration: 15s
In [29]:
// Write the raw mined ruleset to a text file.
// NOTE(review): this file name starts with an upper-case "R" while the MinLift/Clustered exports
// of the same section use "regionBySex..." — confirm whether the difference is intended.
regionBySexTaskRuleset.export("../rulesets/jaur-yago/RegionBySex.txt")
In [30]:
// Keep rules with exactly one refArea body atom, cache, export, and print how many remain.
val regionBySexTaskRulesetFiltered = filterRuleset(regionBySexTaskRuleset).cache
regionBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/RegionBySexFiltered.txt")
println("rules: " + regionBySexTaskRulesetFiltered.size)
rules: 7005
In [31]:
// Support of each filtered regions-by-sex rule, converted to Double for the histogram helper.
val supportSeq = measureSequence(regionBySexTaskRulesetFiltered, Measure.Support).map { measure =>
    measure.asInstanceOf[Measure.Support].value.toDouble
}
plotHistogram(supportSeq, "grey")
In [32]:
// PCA confidence for the regions-by-sex rules (threshold 0.0), cached and exported.
val regionBySexTaskRulesetConfComputed = computeConfidence(regionBySexTaskRulesetFiltered, 0.0).cache
regionBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/RegionBySexConfComputed.txt")

// Histogram of the PCA confidence values.
val confSeq = measureSequence(regionBySexTaskRulesetConfComputed, Measure.PcaConfidence).map { measure =>
    measure.asInstanceOf[Measure.PcaConfidence].value
}
plotHistogram(confSeq, "grey")
In [33]:
// Lift for the regions-by-sex rules (threshold 0.0), cached and exported.
val regionBySexTaskRulesetLiftComputed = computeLift(regionBySexTaskRulesetFiltered, 0.0).cache
regionBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/RegionBySexLiftComputed.txt")

// Histogram of the lift values.
val liftSeq = measureSequence(regionBySexTaskRulesetLiftComputed, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "grey")
In [34]:
// Keep rules with lift of at least 1.0, prune the result and cache it.
val regionBySexTaskRulesetMinLift = regionBySexTaskRulesetLiftComputed
    .filter(_.measures.get(Measure.Lift).get.value >= 1.0)
    .pruned(false, false)
    .cache

regionBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/regionBySexMinLift.txt")
println("rules: " + regionBySexTaskRulesetMinLift.size)

// Histogram of the lift values that survived the filter.
val liftSeq = measureSequence(regionBySexTaskRulesetMinLift, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "grey")
rules: 113
In [35]:
// Bar chart of the regions-by-sex rule counts for rule lengths 4 to 6.
plotHorizontalBar(groupRulesByLength(regionBySexTaskRulesetMinLift, 4, 6), "grey")
In [36]:
// Cluster the regions-by-sex rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val regionBySexTaskRulesetClustered = makeClusters(regionBySexTaskRulesetMinLift, 3, 0.85)
regionBySexTaskRulesetClustered.export("../rulesets/jaur-yago/regionBySexClustered.txt")

// Bar chart of the cluster sizes.
plotHorizontalBar(groupRulesByCluster(regionBySexTaskRulesetClustered), "grey")

Districts Total

In [37]:
// Rule pattern for the districts-total slice. Atoms before `=>:` form the body, the atom after it the head:
//   body: some atom about ?b in graph "yago",
//         ?a refArea ?b in graph "czso",
//         ?a qb:dataSet <jaur-districts-total> in graph "czso"
//   head: ?a <one of the four measures> (any object) in graph "czso"
val districtTotalPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtTotalSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)
In [38]:
// AMIE mining task for the districts-total slice: three times the slice-derived support threshold,
// rules of at most 6 atoms, constants only at the object position, restricted to districtTotalPattern.
// No MinHeadSize is set here (unlike the region tasks); the logged settings of this task show MinHeadSize=100.
val districtTotalTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(districtTotalSlice)*3))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(districtTotalPattern)
In [39]:
// Mine the districts-total rules and print the rule count and the elapsed wall-clock time.
val startTimeMillis = System.currentTimeMillis()
val districtTotalTaskRuleset = districtTotalIndex.mine(districtTotalTask)
println("rules: "+districtTotalTaskRuleset.size)
// Elapsed time is taken after the size has been printed; integer division truncates to whole seconds.
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
2021-05-20 16:59:35:948 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Predicates trimming.
2021-05-20 16:59:36:859 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects indexing.
2021-05-20 16:59:37:918 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects trimming.
2021-05-20 16:59:38:232 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects indexing.
2021-05-20 16:59:38:906 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects trimming.
2021-05-20 16:59:39:033 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Amie task settings:
MinHeadSize=100,
MinHeadCoverage=0.0,
MinSupport=27,
MaxThreads=4,
MinAtomSize=0,
MaxRuleLength=6,
WithConstants=true,
ConstantsPosition=Object,
Timeout=-1,
WithDuplicitPredicates=true,
Patterns=List(Mapped(Vector(Mapped(Variable(?b),Any,Any,Constant(Constant(-1596161386))), Mapped(Variable(?a),Constant(Constant(1481837794)),Variable(?b),Constant(Constant(1015601977))), Mapped(Variable(?a),Constant(Constant(624690160)),Constant(Constant(2112950736)),Constant(Constant(1015601977)))),Some(Mapped(Variable(?a),OneOf(ArrayBuffer(Constant(Constant(-2070273298)), Constant(Constant(1659106226)), Constant(Constant(1797717682)), Constant(Constant(1142069620)))),Any,Constant(Constant(1015601977)))),false,false)),
OnlyPredicates=None,
WithoutPredicates=None
rules: 61964
duration: 76s
In [40]:
// Write the raw mined ruleset to a text file.
districtTotalTaskRuleset.export("../rulesets/jaur-yago/districtTotal.txt")
In [41]:
// Keep rules with exactly one refArea body atom, cache, export, and print how many remain.
val districtTotalTaskRulesetFiltered = filterRuleset(districtTotalTaskRuleset).cache
districtTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/districtTotalFiltered.txt")
println("rules: " + districtTotalTaskRulesetFiltered.size)
rules: 61892
In [42]:
// Support of each filtered districts-total rule, converted to Double for the histogram helper.
val supportSeq = measureSequence(districtTotalTaskRulesetFiltered, Measure.Support).map { measure =>
    measure.asInstanceOf[Measure.Support].value.toDouble
}
plotHistogram(supportSeq, "red")
In [43]:
// PCA confidence for the districts-total rules (threshold 0.0), cached and exported.
val districtTotalTaskRulesetConfComputed = computeConfidence(districtTotalTaskRulesetFiltered, 0.0).cache
districtTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtTotalConfComputed.txt")

// Histogram of the PCA confidence values; values above 1 are left out of the plot.
val confSeq = measureSequence(districtTotalTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(measure => measure.asInstanceOf[Measure.PcaConfidence].value)
    .filter(_ <= 1)
plotHistogram(confSeq, "red")
In [44]:
// Lift for the districts-total rules (threshold 0.0), cached and exported.
val districtTotalTaskRulesetLiftComputed = computeLift(districtTotalTaskRulesetFiltered, 0.0).cache
districtTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtTotalLiftComputed.txt")

// Histogram of the lift values.
val liftSeq = measureSequence(districtTotalTaskRulesetLiftComputed, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "red")
In [45]:
// Keep rules with lift strictly above 1.0, prune the result and cache it.
// NOTE(review): the other three sections keep lift >= 1.0; confirm the strict `>` here is intended.
val districtTotalTaskRulesetMinLift = districtTotalTaskRulesetLiftComputed
    .filter(_.measures.get(Measure.Lift).get.value > 1.0)
    .pruned(false, false)
    .cache

districtTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/districtTotalMinLift.txt")
println("rules: " + districtTotalTaskRulesetMinLift.size)

// Histogram of the lift values that survived the filter.
val liftSeq = measureSequence(districtTotalTaskRulesetMinLift, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
plotHistogram(liftSeq, "red")
rules: 942
In [46]:
// Cluster the districts-total rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val districtTotalTaskRulesetClustered = makeClusters(districtTotalTaskRulesetMinLift, 3, 0.85)
districtTotalTaskRulesetClustered.export("../rulesets/jaur-yago/districtTotalClustered.txt")

// Bar chart of the cluster sizes.
plotHorizontalBar(groupRulesByCluster(districtTotalTaskRulesetClustered), "red")

Districts By Sex

In [47]:
// Rule pattern for the districts-by-sex slice. Atoms before `=>:` form the body, the atom after it the head:
//   body: some atom about ?b in graph "yago",
//         ?a refArea ?b in graph "czso",
//         ?a qb:dataSet <jaur-districts-bysex> in graph "czso"
//   head: ?a <one of the two by-sex measures> (any object) in graph "czso"
val districtBySexPattern = (
    AtomPattern(subject = 'b', graph = uri("yago")) &:
    AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
    AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtBySexSliceUri, graph = uri("czso"))
    =>: 
    AtomPattern(subject = 'a', predicate = oneOfBySexMeasures, graph = uri("czso"))
)
In [48]:
// AMIE mining task for the districts-by-sex slice: three times the slice-derived support threshold,
// rules of at most 6 atoms, constants only at the object position, restricted to districtBySexPattern.
// No MinHeadSize is set here (unlike the region tasks); the logged settings of this task show MinHeadSize=100.
val districtBySexTask = Amie()
    .addThreshold(Threshold.MinSupport(minSupport(districtBySexSlice)*3))
    .addThreshold(Threshold.MaxRuleLength(6))
    .addConstraint(constantsOnlyAtObject)
    .addPattern(districtBySexPattern)
In [49]:
// Mine the districts-by-sex rules and print the rule count and the elapsed wall-clock time.
val startTimeMillis = System.currentTimeMillis()
val districtBySexTaskRuleset = districtBySexIndex.mine(districtBySexTask)
println("rules: "+districtBySexTaskRuleset.size)
// Elapsed time is taken after the size has been printed; integer division truncates to whole seconds.
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
2021-05-20 17:10:18:280 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Predicates trimming.
2021-05-20 17:10:18:659 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects indexing.
2021-05-20 17:10:22:279 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Subjects trimming.
2021-05-20 17:10:22:572 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects indexing.
2021-05-20 17:10:23:278 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Objects trimming.
2021-05-20 17:10:23:374 +0200 [scala-interpreter-1] INFO com.github.propi.rdfrules.utils.Debugger - Amie task settings:
MinHeadSize=100,
MinHeadCoverage=0.0,
MinSupport=54,
MaxThreads=4,
MinAtomSize=0,
MaxRuleLength=6,
WithConstants=true,
ConstantsPosition=Object,
Timeout=-1,
WithDuplicitPredicates=true,
Patterns=List(Mapped(Vector(Mapped(Variable(?b),Any,Any,Constant(Constant(-1596161386))), Mapped(Variable(?a),Constant(Constant(1481837794)),Variable(?b),Constant(Constant(1015601977))), Mapped(Variable(?a),Constant(Constant(624690160)),Constant(Constant(-1288949022)),Constant(Constant(1015601977)))),Some(Mapped(Variable(?a),OneOf(ArrayBuffer(Constant(Constant(-2070273298)), Constant(Constant(1659106226)))),Any,Constant(Constant(1015601977)))),false,false)),
OnlyPredicates=None,
WithoutPredicates=None
rules: 29368
duration: 60s
In [50]:
// Write the raw mined ruleset to a text file.
districtBySexTaskRuleset.export("../rulesets/jaur-yago/districtBySex.txt")
In [51]:
// Keep rules with exactly one refArea body atom, cache, export, and print how many remain.
val districtBySexTaskRulesetFiltered = filterRuleset(districtBySexTaskRuleset).cache
districtBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/districtBySexFiltered.txt")
println("rules: " + districtBySexTaskRulesetFiltered.size)
rules: 29313
In [52]:
// Support of each filtered districts-by-sex rule, converted to Double for the histogram helper.
val supportSeq = measureSequence(districtBySexTaskRulesetFiltered, Measure.Support).map { measure =>
    measure.asInstanceOf[Measure.Support].value.toDouble
}
// NOTE(review): "cls" is not a colour name like "green"/"grey"/"red" used elsewhere — confirm the intended colour.
plotHistogram(supportSeq, "cls")
In [53]:
// PCA confidence for the districts-by-sex rules (threshold 0.0), cached and exported.
val districtBySexTaskRulesetConfComputed = computeConfidence(districtBySexTaskRulesetFiltered, 0.0).cache
districtBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtBySexConfComputed.txt")

// Histogram of the PCA confidence values; values above 1 are left out of the plot.
val confSeq = measureSequence(districtBySexTaskRulesetConfComputed, Measure.PcaConfidence)
    .map(measure => measure.asInstanceOf[Measure.PcaConfidence].value)
    .filter(_ <= 1)
// NOTE(review): "cls" is not a colour name like "green"/"grey"/"red" used elsewhere — confirm the intended colour.
plotHistogram(confSeq, "cls")
In [54]:
// Lift for the districts-by-sex rules (threshold 0.0), cached and exported.
val districtBySexTaskRulesetLiftComputed = computeLift(districtBySexTaskRulesetFiltered, 0.0).cache
districtBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtBySexLiftComputed.txt")

// Histogram of the lift values.
val liftSeq = measureSequence(districtBySexTaskRulesetLiftComputed, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
// NOTE(review): "cls" is not a colour name like "green"/"grey"/"red" used elsewhere — confirm the intended colour.
plotHistogram(liftSeq, "cls")
In [55]:
// Keep rules with lift of at least 1.0, prune the result and cache it.
val districtBySexTaskRulesetMinLift = districtBySexTaskRulesetLiftComputed
    .filter(_.measures.get(Measure.Lift).get.value >= 1.0)
    .pruned(false, false)
    .cache

districtBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/districtBySexMinLift.txt")
println("rules: " + districtBySexTaskRulesetMinLift.size)

// Histogram of the lift values that survived the filter (no upper cut-off is applied).
val liftSeq = measureSequence(districtBySexTaskRulesetMinLift, Measure.Lift).map { measure =>
    measure.asInstanceOf[Measure.Lift].value
}
// NOTE(review): "cls" is not a colour name like "green"/"grey"/"red" used elsewhere — confirm the intended colour.
plotHistogram(liftSeq, "cls")
rules: 417
In [56]:
// Cluster the districts-by-sex rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
// makeClusters already ends with `.cache`, so no second `.cache` is applied here; this matches
// the other three clustering cells.
val districtBySexTaskRulesetClustered = makeClusters(districtBySexTaskRulesetMinLift, 3, 0.85)
districtBySexTaskRulesetClustered.export("../rulesets/jaur-yago/districtBySexClustered.txt")

// Bar chart of the cluster sizes.
// NOTE(review): "cls" is not a colour name like "green"/"grey"/"red" used elsewhere — confirm the intended colour.
plotHorizontalBar(groupRulesByCluster(districtBySexTaskRulesetClustered), "cls")